Prepare libraries

library(DESeq2)
library(data.table)
library(gdata)
library(rtracklayer)
library(BSgenome)
library(VennDiagram)
library(ggplot2)
library(biomaRt)
library(RColorBrewer)
library(readr)
library(ensembldb)
library(EnsDb.Mmusculus.v79)
library(AnnotationDbi)
library(tidyverse)
library(plotly)

Load dataset information

dir.create("PDF_figure/T6B_derepression_CDF_merged", showWarnings = FALSE)

peaks.mir <- readRDS("Datafiles/merged-peaks-mirs-200-12232019-withID.rds")
mirs.peaks <- readRDS("Datafiles/miRNA-merged-peaks-list-12232019-withIDs.rds")

T6B_DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/T6B mouse colon/Result/Differential Analysis.csv")
## New names:
## * `` -> ...1
## Rows: 12563 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(T6B_DGE)[1] <- "gene"
colnames(T6B_DGE)[3] <- "log2FC"

CDF plotting

Set-up CDF plotting function

plotCDF.ggplot <- function(gene.counts, gene.sets, gene.set.labels,
                           col = "", linetype = "", xlim = c( -1.0, 1.3 ),
                           legend.size = 22, axistitle.size = 22, title = "Fold change log2 (Dicer KO/WT)",
                           legend.pos = c(0.7, 0.18)) {
  require(ggplot2)
  df.log2fc <- gene.counts[,c("gene", "log2FC")]
  #rownames(df.log2fc) <- df.log2fc$gene
  if (length(gene.sets) != length(gene.set.labels)){
    return("Length of gene sets doesn't match labels")
  }
  target.expr <- df.log2fc[df.log2fc$gene %in% gene.sets[[1]],]
  for (i in 2:length(gene.sets)){
    target.expr <- rbind(target.expr, df.log2fc[df.log2fc$gene %in% gene.sets[[i]],])
  }

  gene.set.counts <- c()
  for (j in 1:length(gene.sets)){
    gene.set.counts <- c(gene.set.counts, sum(df.log2fc$gene %in% gene.sets[[j]]))
  }
  
  target.expr$Category <- rep(gene.set.labels, gene.set.counts)
  target.expr$Category <- factor(target.expr$Category, levels = gene.set.labels)
  
  log2FC.values <- lapply(gene.sets, function(gene.set) {
    gene.counts[gene.counts$gene %in% gene.set,]$log2FC
  })

  ks.pvals <- lapply(log2FC.values,
                     function(log2FCs) {
                       ks.test(log2FCs, log2FC.values[[1]])$p.value
                     })
  
  p <- ggplot( target.expr, aes( x = log2FC, colour = Category ) ) +
  stat_ecdf( geom = 'step', aes( colour = Category, linetype = Category ), lwd = 1 ) +
  scale_color_manual( values = col, labels = sprintf( "%s (%d)", gene.set.labels, gene.set.counts ) ) +
  scale_linetype_manual( values = linetype, labels = sprintf( "%s (%d)", gene.set.labels, gene.set.counts ) ) +
  # xlim() will remove data points; Be careful in the future
  coord_cartesian( xlim = xlim ) + xlab(title) + ylab('CDF') +
  theme_classic() + theme( legend.background = element_rect(fill = NA), 
                           legend.title = element_blank(), 
                           legend.position = legend.pos,
                           legend.text = element_text(size=legend.size),
                           legend.key.size = unit(1.5, 'lines'),
                           axis.title.x = element_text(size=axistitle.size, margin = margin(t = 10)),
                           axis.title.y = element_text(size=axistitle.size, margin = margin(r = 10)),
                           axis.text=element_text(size=20),
                           axis.line = element_line(size = 1),  #axis label size
                           axis.ticks.length = unit(0.3, "cm")) #increase tick length
  
  for (k in 2:length(gene.sets)){
    p <- p + annotate(geom = "text", x = -0.9, y = 1-0.08*(k-1), hjust = 0, 
                      label = sprintf("p = %.0e", ks.pvals[k]), 
                      colour = col[k], size = 8)
  }
  print(p)
}

Plotting

cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))
plotCDF.ggplot(T6B_DGE,
               list(T6B_DGE$gene, peaks.mir$target_Ensembl_ID),
               c("All genes", "miRNAs over 200"),
               col = c("grey15", cols[1]),
               linetype = c(1, 1),
               title = "RNA_LFC(T6B/control)",
               legend.size = 15
               )
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

pdf("PDF_figure/T6B_derepression_CDF_merged/T6B_CDF_miRNAover200.pdf",
    height = 8,
    width = 10)
plotCDF.ggplot(T6B_DGE,
               list(T6B_DGE$gene, peaks.mir$target_Ensembl_ID),
               c("All genes", "miRNAs over 200"),
               col = c("grey15", cols[1]),
               linetype = c(1, 1),
               title = "RNA_LFC(T6B/control)",
               legend.size = 15
               )
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen 
##                 2
len <- sapply(mirs.peaks, function(x) length(x))
mirs.peaks <- mirs.peaks[order(-len)]
mirna <- names(mirs.peaks)

for (i in 1:10) {
  plotCDF.ggplot(T6B_DGE,
               list(T6B_DGE$gene, mirs.peaks[[i]]$target_Ensembl_ID),
               c("All genes", mirna[i]),
               col = c("grey15", cols[i]),
               linetype = c(1, 1),
               title = "RNA_LFC(T6B/control)",
               legend.size = 15
               )
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

pdf("PDF_figure/T6B_derepression_CDF_merged/T6B_CDF_top10miRNA.pdf",
    height = 8,
    width = 10)
for (i in 1:10) {
  plotCDF.ggplot(T6B_DGE,
               list(T6B_DGE$gene, mirs.peaks[[i]]$target_Ensembl_ID),
               c("All genes", mirna[i]),
               col = c("grey15", cols[i]),
               linetype = c(1, 1),
               title = "RNA_LFC(T6B/control)",
               legend.size = 15
               )
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen 
##                 2

CDF z-score

Prepare function

zscore.cal <- function(genes = NA, index = NA, expression.dataset){
  if (sum(!is.na(genes)) > 0 && is.na(index)){
    gene.set <- expression.dataset[expression.dataset$gene %in% genes,]
  } else if (!is.na(index)) {
    gene.set <- expression.dataset[index,]
  } else {
    print("Please input gene names or row index.")
  }
  mu <- mean(expression.dataset$log2FC)
  Sm <- mean(gene.set$log2FC)
  m <- length(genes)
  SD <- sd(expression.dataset$log2FC)
  z <- (Sm-mu)*sqrt(m)/SD
  return(z)
}

Calculation

zscores.all <- as.data.frame(sapply(mirs.peaks,
                              function(targets){
                                  zscore.cal(genes = targets$target_Ensembl_ID, expression.dataset = T6B_DGE)
                            }))
colnames(zscores.all) <- c("z.score")
lens <- as.data.frame(sapply(mirs.peaks, function(x) length(x)))
colnames(lens) <- "N"
zscores.all <- cbind(zscores.all, lens$N)

mirna.family.DGE <- readRDS("Datafiles/mirna-counts-deseq-by-family-12232019.rds")
zscores.all <- as_tibble(zscores.all, rownames = "miR.family")
zscores.all <- inner_join(zscores.all, mirna.family.DGE[,c(1,7)], by = c("miR.family" = "miR.family"))
colnames(zscores.all)[3] <- "N"

Plotting

p1 <- ggplot(zscores.all, aes(x = z.score, y = log2(baseMean), size = N, label = miR.family)) +
  geom_point(colour = "#15CD40", alpha = 0.6) +
  xlab("T6B RNA-Seq Z-score") +
  ylab("miRNA family abundance") +
  theme_bw() +
      theme(panel.border = element_rect(),
      panel.background = element_blank(),
      panel.grid.major = element_line(), 
      panel.grid.minor = element_line(),
      axis.title.x = element_text(size=14, margin = margin(t = 10)),
      axis.title.y = element_text(size=14, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_blank(),
      axis.line.x = element_blank(),
      axis.ticks.x = element_line(size = 0.5),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

cor.test(zscores.all$z.score, log2(zscores.all$baseMean))
## 
##  Pearson's product-moment correlation
## 
## data:  zscores.all$z.score and log2(zscores.all$baseMean)
## t = 5.1337, df = 73, p-value = 2.27e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3262018 0.6643385
## sample estimates:
##      cor 
## 0.515033
pdf("PDF_figure/T6B_derepression_CDF_merged/T6B_CDF_zscore.pdf",
    height = 4,
    width = 6)
p1
dev.off()
## quartz_off_screen 
##                 2
ggplotly(p1)

Session Info

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] plotly_4.10.0               forcats_0.5.1              
##  [3] stringr_1.4.0               dplyr_1.0.7                
##  [5] purrr_0.3.4                 tidyr_1.1.4                
##  [7] tibble_3.1.6                tidyverse_1.3.1            
##  [9] EnsDb.Mmusculus.v79_2.99.0  ensembldb_2.16.4           
## [11] AnnotationFilter_1.16.0     GenomicFeatures_1.44.2     
## [13] AnnotationDbi_1.54.1        readr_2.1.0                
## [15] RColorBrewer_1.1-2          biomaRt_2.48.3             
## [17] ggplot2_3.3.5               VennDiagram_1.7.0          
## [19] futile.logger_1.4.3         BSgenome_1.60.0            
## [21] Biostrings_2.60.2           XVector_0.32.0             
## [23] rtracklayer_1.52.1          gdata_2.18.0               
## [25] data.table_1.14.2           DESeq2_1.32.0              
## [27] SummarizedExperiment_1.22.0 Biobase_2.52.0             
## [29] MatrixGenerics_1.4.3        matrixStats_0.61.0         
## [31] GenomicRanges_1.44.0        GenomeInfoDb_1.28.4        
## [33] IRanges_2.26.0              S4Vectors_0.30.2           
## [35] BiocGenerics_0.38.0        
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-2         rjson_0.2.20             ellipsis_0.3.2          
##   [4] fs_1.5.0                 rstudioapi_0.13          farver_2.1.0            
##   [7] bit64_4.0.5              lubridate_1.8.0          fansi_0.5.0             
##  [10] xml2_1.3.2               splines_4.1.1            cachem_1.0.6            
##  [13] geneplotter_1.70.0       knitr_1.36               jsonlite_1.7.2          
##  [16] Rsamtools_2.8.0          broom_0.7.10             annotate_1.70.0         
##  [19] dbplyr_2.1.1             png_0.1-7                compiler_4.1.1          
##  [22] httr_1.4.2               backports_1.4.0          assertthat_0.2.1        
##  [25] Matrix_1.3-4             fastmap_1.1.0            lazyeval_0.2.2          
##  [28] cli_3.1.0                formatR_1.11             htmltools_0.5.2         
##  [31] prettyunits_1.1.1        tools_4.1.1              gtable_0.3.0            
##  [34] glue_1.5.0               GenomeInfoDbData_1.2.6   rappdirs_0.3.3          
##  [37] Rcpp_1.0.7               cellranger_1.1.0         jquerylib_0.1.4         
##  [40] vctrs_0.3.8              crosstalk_1.2.0          xfun_0.28               
##  [43] rvest_1.0.2              lifecycle_1.0.1          restfulr_0.0.13         
##  [46] gtools_3.9.2             XML_3.99-0.8             zlibbioc_1.38.0         
##  [49] scales_1.1.1             vroom_1.5.6              hms_1.1.1               
##  [52] ProtGenerics_1.24.0      lambda.r_1.2.4           yaml_2.2.1              
##  [55] curl_4.3.2               memoise_2.0.1            stringi_1.7.5           
##  [58] RSQLite_2.2.8            highr_0.9                genefilter_1.74.1       
##  [61] BiocIO_1.2.0             filelock_1.0.2           BiocParallel_1.26.2     
##  [64] rlang_0.4.12             pkgconfig_2.0.3          bitops_1.0-7            
##  [67] evaluate_0.14            lattice_0.20-45          labeling_0.4.2          
##  [70] htmlwidgets_1.5.4        GenomicAlignments_1.28.0 bit_4.0.4               
##  [73] tidyselect_1.1.1         magrittr_2.0.1           R6_2.5.1                
##  [76] generics_0.1.1           DelayedArray_0.18.0      DBI_1.1.1               
##  [79] haven_2.4.3              pillar_1.6.4             withr_2.4.2             
##  [82] survival_3.2-13          KEGGREST_1.32.0          RCurl_1.98-1.5          
##  [85] modelr_0.1.8             crayon_1.4.2             futile.options_1.0.1    
##  [88] utf8_1.2.2               BiocFileCache_2.0.0      tzdb_0.2.0              
##  [91] rmarkdown_2.11           progress_1.2.2           readxl_1.3.1            
##  [94] locfit_1.5-9.4           blob_1.2.2               reprex_2.0.1            
##  [97] digest_0.6.28            xtable_1.8-4             munsell_0.5.0           
## [100] viridisLite_0.4.0